{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Load Data\n",
    "train_df = pd.read_csv(\"../data/train.csv\")\n",
    "test_df = pd.read_csv(\"../data/test.csv\")\n",
    "validation_df = pd.read_csv(\"../data/valid.csv\")\n",
    "y_test = np.zeros(len(test_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def evaluate_recall(y, y_test, k=1):\n",
    "    num_examples = float(len(y))\n",
    "    num_correct = 0\n",
    "    for predictions, label in zip(y, y_test):\n",
    "        if label in predictions[:k]:\n",
    "            num_correct += 1\n",
    "    return num_correct/num_examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def predict_random(context, utterances):\n",
    "    return np.random.choice(len(utterances), 10, replace=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recall @ (1, 10): 0.103541\n",
      "Recall @ (2, 10): 0.205391\n",
      "Recall @ (5, 10): 0.503805\n",
      "Recall @ (10, 10): 1\n"
     ]
    }
   ],
   "source": [
    "# Evaluate Random predictor\n",
    "y_random = [predict_random(test_df.Context[x], test_df.iloc[x,1:].values) for x in range(len(test_df))]\n",
    "for n in [1, 2, 5, 10]:\n",
    "    print(\"Recall @ ({}, 10): {:g}\".format(n, evaluate_recall(y_random, y_test, n)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class TFIDFPredictor:\n",
    "    def __init__(self):\n",
    "        self.vectorizer = TfidfVectorizer()\n",
    "\n",
    "    def train(self, data):\n",
    "        self.vectorizer.fit(np.append(data.Context.values,data.Utterance.values))\n",
    "\n",
    "    def predict(self, context, utterances):\n",
    "        # Convert context and utterances into tfidf vector\n",
    "        vector_context = self.vectorizer.transform([context])\n",
    "        vector_doc = self.vectorizer.transform(utterances)\n",
    "        # The dot product measures the similarity of the resulting vectors\n",
    "        result = np.dot(vector_doc, vector_context.T).todense()\n",
    "        result = np.asarray(result).flatten()\n",
    "        # Sort by top results and return the indices in descending order\n",
    "        return np.argsort(result, axis=0)[::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Recall @ (1, 10): 0.495032\n",
      "Recall @ (2, 10): 0.596882\n",
      "Recall @ (5, 10): 0.766121\n",
      "Recall @ (10, 10): 1\n"
     ]
    }
   ],
   "source": [
    "# Evaluate TFIDF predictor\n",
    "pred = TFIDFPredictor()\n",
    "pred.train(train_df)\n",
    "y = [pred.predict(test_df.Context[x], test_df.iloc[x,1:].values) for x in range(len(test_df))]\n",
    "for n in [1, 2, 5, 10]:\n",
    "    print(\"Recall @ ({}, 10): {:g}\".format(n, evaluate_recall(y, y_test, n)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
